library(ggplot2)
library(stringi)
library(gridExtra)
library(dendextend)
library(kableExtra)
library(psych)
library(limma)
library(tidyverse)
library(preprocessCore)
library(CONSTANd) # install from source: https://github.com/PDiracDelta/CONSTANd/
library(NOMAD) # devtools::install_github("carlmurie/NOMAD")
library(vsn)
This notebook presents an isobaric labeling data analysis strategy that includes data-driven normalization.
We will check how varying the analysis components [summarization/normalization/differential abundance testing methods] changes the end results of a quantitative proteomic study.
# Load helper functions and the input data set.
source('other_functions.R')
source('plotting_functions.R')
# you should either make a symbolic link in this directory
data.list <- readRDS('input_data.rds')
# long-format PSM table; the wide-format variant is unused here
dat.l <- data.list$dat.l
# dat.w <- data.list$dat.w # data in wide format
# drop a leftover row-index column, if present
if ('X' %in% names(dat.l)) {
  dat.l[['X']] <- NULL
}
# remove shared peptides
# BUG FIX: the filtered result used to be assigned to an unused variable
# ('shared.peptides'), so shared peptides were silently kept downstream.
dat.l <- dat.l %>% filter(!shared.peptide)
# keep spectra with isolation interference <30 and no missing quantification channels
dat.l <- dat.l %>% filter(isoInterOk & noNAs)
# which proteins were spiked in? (spike-in accessions contain the substring 'ups')
spiked.proteins <- dat.l %>% distinct(Protein) %>% filter(stri_detect(Protein, fixed='ups')) %>% pull %>% as.character
# which peptides were identified in each MS run?
# (use <- for assignment, per R convention, instead of =)
unique.pep <- dat.l %>%
  group_by(Run) %>%
  distinct(Peptide) %>%
  mutate(val = 1)
# Peptide x Run incidence table: 1 if the peptide was observed in that run.
unique.pep <- xtabs(val ~ Peptide + Run, data = unique.pep)
# peptides observed in every run ("inner" peptides)
seen.in.all.runs <- apply(unique.pep, 1, function(x) all(x == 1))
inner.peptides <- rownames(unique.pep)[seen.in.all.runs]
# Names of the normalization variants under comparison (one per varying
# analysis component) and how many there are.
variant.names <- c('quantile', 'CONSTANd', 'NOMAD', 'medianSweeping', 'CycLoessVSN')
n.comp.variants <- length(variant.names)
# every variant operates on log2-transformed intensities
scale.vec <- rep('log', n.comp.variants)
# pick reference channel and condition for making plots / doing DEA
referenceChannel <- '127C'
referenceCondition <- '0.5'
# Fixed plotting colour for each biological condition.
condition.colour <- tibble(
  Condition = c("0.125", "0.5", "0.667", "1"),
  Colour    = c('black', 'blue', 'green', 'red')
)
# sample annotation: distinct (Run, Channel, Sample, Condition, Colour) rows
sample.info <- get_sample_info(dat.l, condition.colour)
channelNames <- remove_factors(unique(sample.info$Channel))
# Work on log2-transformed intensities from here on.
dat.unit.l <- dat.l %>%
  mutate(response = log2(Intensity)) %>%
  select(-Intensity)
# switch to wide format: one column per quantification channel
dat.unit.w <- pivot_wider(dat.unit.l,
                          id_cols = -one_of(c('Condition', 'BioReplicate')),
                          names_from = Channel, values_from = response)
# container holding one normalized data set per variant
dat.norm.w <- emptyList(variant.names)
We apply quantile normalization to each Run separately, and then re-scale the observations so that the mean observation within each run is set equal to the mean observation across all runs.
# Quantile-normalize each Run separately, then rescale each run so that its
# mean observation equals the grand mean across all runs.
grand_average <- mean(as.matrix(dat.unit.w[,channelNames]))
dat.norm.w$quantile <- dat.unit.w %>%
  split(.$Run) %>%
  lapply(function(run.dat) {
    # quantile normalization within this run only
    run.dat[,channelNames] <- normalize.quantiles(as.matrix(run.dat[,channelNames]))
    # equalize the run averages across runs
    run.dat[,channelNames] <- run.dat[,channelNames] / mean(colMeans(run.dat[,channelNames])) * grand_average
    run.dat
  }) %>%
  bind_rows()
# CONSTANd normalization, applied to each Run separately.
# (the long-format dat.unit.l has no channel columns, hence the wide format here)
dat.norm.w$CONSTANd <- dat.unit.w %>%
  split(.$Run) %>%
  lapply(function(run.dat) {
    run.dat[,channelNames] <- CONSTANd(run.dat[,channelNames])$normalized_data
    run.dat
  }) %>%
  bind_rows()
We apply NOMAD on the PSM level instead of the peptide level.
# NOMAD normalization on the PSM level.
# doRobust=FALSE: use means, like CONSTANd; doLog=FALSE: values are already log2.
nomad.input <- dat.unit.l %>% rename(iTRAQ=Channel) %>% as.data.frame
dat.nomadnorm <- nomadNormalization(dat.unit.l$response, nomad.input,
                                    doRobust = FALSE, doiTRAQCorrection = FALSE, doLog = FALSE)
## Running normalization with 464224 number of data points
## Normalizing for factor: Peptide
## Normalizing for factor: Run
## Normalizing for factor: iTRAQ
## Normalizing for factor: Run Peptide
## Normalizing for factor: Run iTRAQ
# put the normalized responses back onto the annotation table, then widen
dat.nomadnorm$x$response <- dat.nomadnorm$y
dat.norm.w$NOMAD <- dat.nomadnorm$x %>%
  pivot_wider(id_cols = -one_of(c('Condition', 'BioReplicate')),
              names_from = iTRAQ, values_from = response)
# medianSweeping, step 1: subtract each spectrum's median log2 intensity from
# its observed log2 intensities (channel medians are swept in a later step).
dat.norm.w$medianSweeping <- dat.unit.w
spectrum.medians <- apply(dat.norm.w$medianSweeping[,channelNames], 1, median)
dat.norm.w$medianSweeping[,channelNames] <- sweep(dat.norm.w$medianSweeping[,channelNames], 1, spectrum.medians)
# Cyclic loess normalization within each Run, followed by VSN across all runs
# (stratified by Run).
dat.norm.w$CycLoessVSN <- dat.unit.w %>%
  split(.$Run) %>%
  lapply(function(run.dat) {
    run.dat[,channelNames] <- normalizeCyclicLoess(run.dat[,channelNames], span=0.005, iterations=15)
    run.dat
  }) %>%
  bind_rows()
vsn.tmp <- vsn2(as.matrix(dat.norm.w$CycLoessVSN[,channelNames]), strata=dat.norm.w$CycLoessVSN$Run)
dat.norm.w$CycLoessVSN[,channelNames] <- vsn.tmp@hx
Summarize quantification values from PSM to peptide (first step) to protein (second step).
# normalized data: summarize PSM -> peptide -> protein (median at each level)
dat.norm.summ.w <- lapply(dat.norm.w, function(x) {
  # select() first because summarising character columns raises warnings;
  # all_of() makes the external 'channelNames' vector an explicit selection
  # (bare character vectors in select()/summarise_at() are deprecated), and
  # across() replaces the superseded summarise_at().
  x %>%
    group_by(Run, Protein, Peptide) %>%
    select(Run, Protein, Peptide, all_of(channelNames)) %>%
    summarise(across(all_of(channelNames), median)) %>%
    select(Run, Protein, all_of(channelNames)) %>%
    summarise(across(all_of(channelNames), median)) %>%
    ungroup()
})
Let’s also summarize the non-normalized data for use in the comparison section.
# non-normalized data: same PSM -> peptide -> protein median summarization.
# all_of() replaces the deprecated bare character vector in select(), and
# across() replaces the superseded summarise_at(); select() first because
# summarising character columns raises warnings.
dat.nonnorm.summ.w <- dat.unit.w %>%
  group_by(Run, Protein, Peptide) %>%
  select(Run, Protein, Peptide, all_of(channelNames)) %>%
  summarise(across(all_of(channelNames), median)) %>%
  select(Run, Protein, all_of(channelNames)) %>%
  summarise(across(all_of(channelNames), median)) %>%
  ungroup()
# make data completely wide (also across runs)
## normalized data: one column per Run:Channel combination
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
  x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames),
                    names_glue = "{Run}:{.value}")
})
# NOTE(review): dat.norm.summ.w2 is a list, so colnames() returns NULL here;
# names(dat.norm.summ.w2) was probably intended — confirm.
colnames(dat.norm.summ.w2)
## NULL
# apply normalizeQuantiles again, now to the data from all runs simultaneously
# NOTE(review): dat.norm.summ.w2 appears to be rebuilt from dat.norm.summ.w in
# a later chunk, which would discard this second-stage result — confirm the
# intended execution order.
dat.norm.summ.w2$quantile[,sample.info$Sample] <- normalize.quantiles(as.matrix(dat.norm.summ.w2$quantile[,sample.info$Sample]))
dat.norm.summ.w2
## $quantile
## # A tibble: 4,083 x 33
## Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 A0AVT1 14.2 14.7 15.0 14.0
## 2 A0FGR8 14.2 14.8 15.2 14.4
## 3 A0MZ66 14.9 14.7 13.9 14.5
## 4 A1L0T0 14.3 13.6 NA 14.2
## 5 A3KMH1 11.0 11.1 10.5 NA
## 6 A4D1E9 15.2 15.0 NA 14.4
## 7 A5YKK6 14.6 NA NA 14.9
## 8 A6NDG6 10.3 12.2 10.4 15.5
## 9 A6NDU8 13.5 13.8 13.4 13.0
## 10 A6NHQ2 16.6 14.7 NA NA
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## # `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## # `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## # `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## # `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## # `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## # `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## # `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## # `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## # `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
##
## $CONSTANd
## # A tibble: 4,083 x 33
## Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 A0AVT1 0.999 0.991 1.00 1.00
## 2 A0FGR8 0.993 1.00 1.00 0.999
## 3 A0MZ66 0.997 1.01 1.00 0.997
## 4 A1L0T0 1.01 1.00 NA 1.01
## 5 A3KMH1 1.05 1.03 0.999 NA
## 6 A4D1E9 1.01 0.998 NA 1.00
## 7 A5YKK6 0.995 NA NA 0.991
## 8 A6NDG6 0.999 0.993 0.995 0.998
## 9 A6NDU8 1.00 0.997 1.01 1.01
## 10 A6NHQ2 0.995 0.982 NA NA
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## # `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## # `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## # `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## # `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## # `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## # `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## # `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## # `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## # `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
##
## $NOMAD
## # A tibble: 4,083 x 33
## Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 A0AVT1 -0.00791 -0.125 0.00800 0.0331
## 2 A0FGR8 -0.104 0.0339 -0.00415 -0.0203
## 3 A0MZ66 -0.0427 0.120 0.00582 -0.0431
## 4 A1L0T0 0.0953 0.0311 NA 0.152
## 5 A3KMH1 0.549 0.335 0.00572 NA
## 6 A4D1E9 0.187 -0.0381 NA 0.0706
## 7 A5YKK6 -0.0721 NA NA -0.145
## 8 A6NDG6 -0.00381 -0.0785 -0.0302 -0.0421
## 9 A6NDU8 0.0250 -0.0416 0.166 0.136
## 10 A6NHQ2 -0.0905 -0.279 NA NA
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## # `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## # `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## # `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## # `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## # `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## # `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## # `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## # `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## # `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
##
## $medianSweeping
## # A tibble: 4,083 x 33
## Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 A0AVT1 -0.0215 -0.140 -0.0269 0.00507
## 2 A0FGR8 -0.130 0.0122 -0.0618 -0.0824
## 3 A0MZ66 -0.0429 0.0990 -0.0530 -0.102
## 4 A1L0T0 0.0111 0.00937 NA 0.0561
## 5 A3KMH1 0.563 0.216 -0.0436 NA
## 6 A4D1E9 0.169 -0.0629 NA 0.00864
## 7 A5YKK6 -0.0737 NA NA -0.239
## 8 A6NDG6 -0.0733 -0.0701 -0.129 -0.0980
## 9 A6NDU8 -0.00860 -0.105 0.0997 0.0165
## 10 A6NHQ2 -0.120 -0.208 NA NA
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## # `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## # `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## # `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## # `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## # `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## # `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## # `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## # `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## # `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
##
## $CycLoessVSN
## # A tibble: 4,083 x 33
## Protein `Mixture1_1:127… `Mixture1_2:127… `Mixture2_1:127… `Mixture2_2:127…
## <fct> <dbl> <dbl> <dbl> <dbl>
## 1 A0AVT1 9.92 9.81 9.90 10.1
## 2 A0FGR8 9.92 9.81 9.90 10.1
## 3 A0MZ66 9.92 9.81 9.90 10.1
## 4 A1L0T0 9.92 9.81 NA 10.1
## 5 A3KMH1 9.92 9.81 9.89 NA
## 6 A4D1E9 9.92 9.81 NA 10.1
## 7 A5YKK6 9.92 NA NA 10.1
## 8 A6NDG6 9.92 9.81 9.89 10.1
## 9 A6NDU8 9.92 9.81 9.90 10.1
## 10 A6NHQ2 9.93 9.81 NA NA
## # … with 4,073 more rows, and 28 more variables: `Mixture1_1:127N` <dbl>,
## # `Mixture1_2:127N` <dbl>, `Mixture2_1:127N` <dbl>, `Mixture2_2:127N` <dbl>,
## # `Mixture1_1:128C` <dbl>, `Mixture1_2:128C` <dbl>, `Mixture2_1:128C` <dbl>,
## # `Mixture2_2:128C` <dbl>, `Mixture1_1:128N` <dbl>, `Mixture1_2:128N` <dbl>,
## # `Mixture2_1:128N` <dbl>, `Mixture2_2:128N` <dbl>, `Mixture1_1:129C` <dbl>,
## # `Mixture1_2:129C` <dbl>, `Mixture2_1:129C` <dbl>, `Mixture2_2:129C` <dbl>,
## # `Mixture1_1:129N` <dbl>, `Mixture1_2:129N` <dbl>, `Mixture2_1:129N` <dbl>,
## # `Mixture2_2:129N` <dbl>, `Mixture1_1:130C` <dbl>, `Mixture1_2:130C` <dbl>,
## # `Mixture2_1:130C` <dbl>, `Mixture2_2:130C` <dbl>, `Mixture1_1:130N` <dbl>,
## # `Mixture1_2:130N` <dbl>, `Mixture2_1:130N` <dbl>, `Mixture2_2:130N` <dbl>
# medianSweeping, step 2: within each MS run, subtract from every channel its
# median computed across all proteins in that channel.
dat.norm.summ.w$medianSweeping <- dat.norm.summ.w$medianSweeping %>%
  split(.$Run) %>%
  lapply(function(run.dat) {
    channel.medians <- apply(run.dat[,channelNames], 2, median)
    run.dat[,channelNames] <- sweep(run.dat[,channelNames], 2, channel.medians)
    run.dat
  }) %>%
  bind_rows()
# additional VSN on protein level data applied to all runs
# NOTE(review): this mutates dat.norm.summ.w2, which the next chunk rebuilds
# from dat.norm.summ.w — verify this VSN step isn't discarded by that rebuild.
vsn.tmp <- vsn2(as.matrix(dat.norm.summ.w2$CycLoessVSN %>% select(-Protein)))
dat.norm.summ.w2$CycLoessVSN[, colnames(dat.norm.summ.w2$CycLoessVSN)!='Protein'] <- vsn.tmp@hx
# make data completely wide (also across runs)
## non-normalized data
dat.nonnorm.summ.w2 <- dat.nonnorm.summ.w %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_glue = "{Run}:{.value}")
## normalized data
# we already did this earlier, right before the second stage of quantile normalization!
# NOTE(review): this reassignment replaces dat.norm.summ.w2 wholesale; if the
# chunks run in document order it would discard the second-stage quantile
# normalization and the protein-level VSN applied earlier — confirm intended.
dat.norm.summ.w2 <- lapply(dat.norm.summ.w, function(x) {
dat.tmp <- x %>% pivot_wider(names_from = Run, values_from = all_of(channelNames), names_sep = ":")
dat.tmp <- flip_colnames(dat.tmp, 'Protein')
return(dat.tmp)
})
# Boxplots of protein-level data, raw first and then each normalization
# variant. Uses the (half-)wide format.
par(mfrow=c(3,2))
boxplot_w(dat.nonnorm.summ.w, sample.info, 'Raw')
for (i in seq_len(n.comp.variants)) {
  boxplot_w(dat.norm.summ.w[[i]], sample.info,
            paste('Normalized', variant.names[i], sep='_'))
}
par(mfrow=c(1, 1))
MA plots of two single samples taken from condition 1 and condition 0.125, measured in different MS runs (samples Mixture2_1:127C and Mixture1_2:129N, respectively).
par(mfrow=c(3,2))
# different unit variants require different computation of fold changes and average abundance:
# additive or multiplicative scale; see maplot_ils function
# use wide2 format
p <- vector('list', n.comp.variants+1)
# BUG FIX: the raw-data call previously used scale.vec[i] with 'i' not yet
# defined in this chunk (stale loop index, or an error on a fresh session);
# the raw data are log2-transformed, so pass 'log' explicitly.
p[[1]] <- maplot_ils(dat.nonnorm.summ.w2, 'Mixture2_1:127C', 'Mixture1_2:129N', 'log', 'Raw')
for (i in seq_len(n.comp.variants)) {
  p[[i+1]] <- maplot_ils(dat.norm.summ.w2[[i]], 'Mixture2_1:127C', 'Mixture1_2:129N', scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(grobs = p, ncol=2, nrow=3)
par(mfrow=c(1, 1))
MA plots of all samples from condition 1 and condition 0.125 (quantification values averaged within condition).
par(mfrow=c(3,2))
# different unit variants require different computation of fold changes and average abundance:
# additive or multiplicative scale; see maplot_ils function
channels.num <- sample.info %>% filter(Condition=='1') %>% pull(Sample)
channels.denom <- sample.info %>% filter(Condition=='0.125') %>% pull(Sample)
p <- vector('list', n.comp.variants+1)
# BUG FIX: pass 'log' explicitly instead of scale.vec[i], which relied on a
# stale loop index from a previous chunk.
p[[1]] <- maplot_ils(dat.nonnorm.summ.w2, channels.num, channels.denom, 'log', 'Raw')
for (i in seq_len(n.comp.variants)) {
  p[[i+1]] <- maplot_ils(dat.norm.summ.w2[[i]], channels.num, channels.denom, scale.vec[i], paste('Normalized', variant.names[i], sep='_'))
}
grid.arrange(grobs = p, ncol=2, nrow=3)
par(mfrow=c(1, 1))
par(mfrow=c(3,2))
# Derive the Mixture identifier (the part of the Run name before '_') and
# melt back to long format; the same transformation serves both data sets.
add_mixture_and_lengthen <- function(x) {
  x$Mixture <- vapply(stri_split(x$Run, fixed='_'),
                      function(parts) parts[1], character(1))
  to_long_format(x, sample.info)
}
dat.nonnorm.summ.l <- lapply(list(dat.nonnorm.summ.w), add_mixture_and_lengthen)
dat.norm.summ.l <- lapply(dat.norm.summ.w, add_mixture_and_lengthen)
par(mfrow=c(2, 2))
# CV plots per protein and condition: raw data first, then each variant.
cvplot_ils(dat=dat.nonnorm.summ.l[[1]], feature.group='Protein',
           xaxis.group='Condition', title='Raw', abs=FALSE)
for (i in seq_len(n.comp.variants)) {
  cvplot_ils(dat=dat.norm.summ.l[[i]], feature.group='Protein',
             xaxis.group='Condition',
             title=paste('Normalized', variant.names[i], sep='_'), abs=FALSE)
}
par(mfrow=c(1, 1))
par(mfrow=c(3,2))
# PCA on all proteins (the spiked-proteins-only PCA follows in the next chunk)
pcaplot_ils(dat.nonnorm.summ.w2 %>% select(-'Protein'), info=sample.info, 'Raw')
for (i in seq_along(dat.norm.summ.w2)){
# same plot per normalization variant (still all proteins, not only spiked)
pcaplot_ils(dat.norm.summ.w2[[i]] %>% select(-'Protein'), info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}
par(mfrow=c(1, 1))
par(mfrow=c(3,2))
# PCA restricted to the spiked-in proteins only.
keep_spiked <- function(x) {
  x %>% filter(Protein %in% spiked.proteins) %>% select(-'Protein')
}
pcaplot_ils(keep_spiked(dat.nonnorm.summ.w2), info=sample.info, 'Raw')
for (i in seq_along(dat.norm.summ.w2)) {
  pcaplot_ils(keep_spiked(dat.norm.summ.w2[[i]]), info=sample.info,
              paste('Normalized', variant.names[i], sep='_'))
}
par(mfrow=c(1, 1))
Here we only use the spiked proteins.
par(mfrow=c(3,2))
# Hierarchical clustering dendrograms, spiked-in proteins only.
dendrogram_ils(dat.nonnorm.summ.w2 %>% filter(Protein %in% spiked.proteins) %>% select(-Protein),
               info=sample.info, 'Raw')
for (i in seq_along(dat.norm.summ.w2)) {
  dendrogram_ils(dat.norm.summ.w2[[i]] %>% filter(Protein %in% spiked.proteins) %>% select(-Protein),
                 info=sample.info, paste('Normalized', variant.names[i], sep='_'))
}
par(mfrow=c(1, 1))
TODO: - Also try to log-transform the intensity case, to see if there are large differences in the t-test results. - done. remove this code? NOTE: - actually, lmFit (used in moderated_ttest) was built for log2-transformed data. However, supplying untransformed intensities can also work. This just means that the effects in the linear model are also additive on the untransformed scale, whereas for log-transformed data they are multiplicative on the untransformed scale. Also, there may be a bias which occurs from biased estimates of the population means in the t-tests, as mean(X) is not equal to exp(mean(log(X))).
# Moderated t-test DEA for each normalization variant, on the scale recorded
# in scale.vec for that variant.
design.matrix <- get_design_matrix(referenceCondition, sample.info)
dat.dea <- emptyList(names(dat.norm.summ.w2))
for (variant in names(dat.norm.summ.w2)) {
  this_scale <- scale.vec[match(variant, variant.names)]
  d <- column_to_rownames(as.data.frame(dat.norm.summ.w2[[variant]]), 'Protein')
  dat.dea[[variant]] <- moderated_ttest(dat=d, design.matrix, scale=this_scale)
}
# also see what the unnormalized results would look like
n.comp.variants <- n.comp.variants + 1
variant.names <- c(variant.names, 'raw')
scale.vec <- c(scale.vec, 'raw')
dat.dea$raw <- moderated_ttest(dat=column_to_rownames(dat.nonnorm.summ.w2, 'Protein'),
                               design.matrix, scale='raw')
# confusion matrices at q.mod < 0.05, with spiked proteins as ground truth
cm <- conf_mat(dat.dea, 'q.mod', 0.05, spiked.proteins)
print_conf_mat(cm)
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4064 | 15 | 4061 | 5 | 4062 | 4 | 4061 | 4 | 4064 | 19 | 4064 | 15 |
| DE | 0 | 4 | 3 | 14 | 2 | 15 | 3 | 15 | 0 | 0 | 0 | 4 |
| quantile | CONSTANd | NOMAD | medianSweeping | CycLoessVSN | raw | |
|---|---|---|---|---|---|---|
| Accuracy | 0.996 | 0.998 | 0.999 | 0.998 | 0.995 | 0.996 |
| Sensitivity | 0.211 | 0.737 | 0.789 | 0.789 | 0.000 | 0.211 |
| Specificity | 1.000 | 0.999 | 1.000 | 0.999 | 1.000 | 1.000 |
| PPV | 1.000 | 0.824 | 0.882 | 0.833 | NaN | 1.000 |
| NPV | 0.996 | 0.999 | 0.999 | 0.999 | 0.995 | 0.996 |
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4064 | 19 | 4064 | 15 | 4064 | 17 | 4062 | 17 | 4064 | 19 | 4064 | 19 |
| DE | 0 | 0 | 0 | 4 | 0 | 2 | 2 | 2 | 0 | 0 | 0 | 0 |
| quantile | CONSTANd | NOMAD | medianSweeping | CycLoessVSN | raw | |
|---|---|---|---|---|---|---|
| Accuracy | 0.995 | 0.996 | 0.996 | 0.995 | 0.995 | 0.995 |
| Sensitivity | 0.000 | 0.211 | 0.105 | 0.105 | 0.000 | 0.000 |
| Specificity | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 | 1.000 |
| PPV | NaN | 1.000 | 1.000 | 0.500 | NaN | NaN |
| NPV | 0.995 | 0.996 | 0.996 | 0.996 | 0.995 | 0.995 |
| background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | background | spiked | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| not_DE | 4064 | 17 | 4059 | 5 | 4062 | 5 | 4060 | 5 | 4064 | 19 | 4064 | 17 |
| DE | 0 | 2 | 5 | 14 | 2 | 14 | 4 | 14 | 0 | 0 | 0 | 2 |
| quantile | CONSTANd | NOMAD | medianSweeping | CycLoessVSN | raw | |
|---|---|---|---|---|---|---|
| Accuracy | 0.996 | 0.998 | 0.998 | 0.998 | 0.995 | 0.996 |
| Sensitivity | 0.105 | 0.737 | 0.737 | 0.737 | 0.000 | 0.105 |
| Specificity | 1.000 | 0.999 | 1.000 | 0.999 | 1.000 | 1.000 |
| PPV | 1.000 | 0.737 | 0.875 | 0.778 | NaN | 1.000 |
| NPV | 0.996 | 0.999 | 0.999 | 0.999 | 0.995 | 0.996 |
TODO (Piotr): constant NOMAD and RAW q-values (approx. 1) generate an error in the scatterplots.
# Column names holding the log2 fold changes and moderated q-values.
dea.cols <- colnames(dat.dea[[1]])
logFC.cols <- dea.cols[stri_detect_fixed(dea.cols, 'logFC')]
q.cols <- dea.cols[stri_detect_fixed(dea.cols, 'q.mod')]
n.contrasts <- length(logFC.cols)
#scatterplot_ils(dat.dea, q.cols, 'p-values') # commented due to error, sd=0 for NOMAD and RAW
scatterplot_ils(dat.dea, logFC.cols, 'log2FC')
# one volcano plot per contrast
for (contrast in seq_len(n.contrasts)) {
  volcanoplot_ils(dat.dea, contrast, spiked.proteins)
}
Let’s see whether the spiked protein fold changes make sense
# plot theoretical value (horizontal lines) and violin per condition
dat.spiked.logfc <- lapply(dat.dea, function(x) x[spiked.proteins, logFC.cols])
dat.spiked.logfc.l <- lapply(dat.spiked.logfc, function(x) {
  # column names look like 'logFC_<condition>': keep only the condition part
  condition.part <- function(nm) {
    vapply(nm, function(z) strsplit(z, '_')[[1]][2], character(1))
  }
  x %>%
    rename_with(condition.part) %>%
    pivot_longer(cols = everything(), names_to = 'condition', values_to = 'logFC') %>%
    add_column(Protein = rep(rownames(x), each = length(colnames(x))))
})
# drop the reference condition (its fold change is trivially zero)
violinplot_ils(lapply(dat.spiked.logfc.l, filter, condition != referenceCondition))
# record package versions for reproducibility
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 18.04.5 LTS
##
## Matrix products: default
## BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.7.1
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.7.1
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=de_BE.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=de_BE.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=de_BE.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=de_BE.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] parallel stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] vsn_3.57.0 Biobase_2.49.1 BiocGenerics_0.35.4
## [4] NOMAD_0.99.0 dplR_1.7.1 CONSTANd_0.99.4
## [7] preprocessCore_1.51.0 forcats_0.5.0 stringr_1.4.0
## [10] dplyr_1.0.2 purrr_0.3.4 readr_1.4.0
## [13] tidyr_1.1.2 tibble_3.0.4 tidyverse_1.3.0
## [16] limma_3.45.19 psych_2.0.9 kableExtra_1.3.1
## [19] dendextend_1.14.0 gridExtra_2.3 stringi_1.5.3
## [22] ggplot2_3.3.2
##
## loaded via a namespace (and not attached):
## [1] colorspace_1.4-1 ellipsis_0.3.1 class_7.3-17
## [4] fs_1.5.0 rstudioapi_0.11 farver_2.0.3
## [7] affyio_1.59.0 prodlim_2019.11.13 fansi_0.4.1
## [10] lubridate_1.7.9 xml2_1.3.2 codetools_0.2-16
## [13] splines_4.0.3 R.methodsS3_1.8.1 mnormt_2.0.2
## [16] knitr_1.30 jsonlite_1.7.1 pROC_1.16.2
## [19] caret_6.0-86 broom_0.7.2 dbplyr_1.4.4
## [22] png_0.1-7 R.oo_1.24.0 BiocManager_1.30.10
## [25] compiler_4.0.3 httr_1.4.2 backports_1.1.10
## [28] assertthat_0.2.1 Matrix_1.2-18 cli_2.1.0
## [31] htmltools_0.5.0 tools_4.0.3 gtable_0.3.0
## [34] glue_1.4.2 affy_1.67.1 reshape2_1.4.4
## [37] Rcpp_1.0.5 cellranger_1.1.0 vctrs_0.3.4
## [40] nlme_3.1-150 iterators_1.0.13 timeDate_3043.102
## [43] xfun_0.18 gower_0.2.2 rvest_0.3.6
## [46] lifecycle_0.2.0 XML_3.99-0.5 zlibbioc_1.35.0
## [49] MASS_7.3-53 scales_1.1.1 ipred_0.9-9
## [52] hms_0.5.3 yaml_2.2.1 rpart_4.1-15
## [55] highr_0.8 foreach_1.5.1 e1071_1.7-4
## [58] lava_1.6.8 rlang_0.4.8 pkgconfig_2.0.3
## [61] matrixStats_0.57.0 evaluate_0.14 lattice_0.20-41
## [64] recipes_0.1.14 labeling_0.4.2 tidyselect_1.1.0
## [67] plyr_1.8.6 magrittr_1.5 R6_2.4.1
## [70] generics_0.0.2 DBI_1.1.0 pillar_1.4.6
## [73] haven_2.3.1 withr_2.3.0 mgcv_1.8-33
## [76] survival_3.2-7 nnet_7.3-14 modelr_0.1.8
## [79] crayon_1.3.4 utf8_1.1.4 tmvnsim_1.0-2
## [82] rmarkdown_2.5 viridis_0.5.1 grid_4.0.3
## [85] readxl_1.3.1 data.table_1.13.2 blob_1.2.1
## [88] ModelMetrics_1.2.2.2 reprex_0.3.0 digest_0.6.27
## [91] webshot_0.5.2 R.utils_2.10.1 signal_0.7-6
## [94] stats4_4.0.3 munsell_0.5.0 viridisLite_0.3.0